from ple.games.flappybird import FlappyBird
from ple import PLE
import matplotlib.pyplot as plt
import os
import numpy as np
import moviepy.editor as mpy
import math
import copy
from collections import defaultdict
from IPython.display import Image, display
pygame 1.9.6 Hello from the pygame community. https://www.pygame.org/contribute.html couldn't import doomish Couldn't import doom
# Floor values for the decaying rates (see Agent.update_parameters):
# exploration decays from 0.5 toward this floor; the learning-rate floor
# equals its cap, so the learning rate is effectively constant at 0.5.
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.5
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy VideoClip from a list of frames.

    Args:
        images: sequence of frames (numpy arrays). If ``true_image`` is
            True they are assumed to already be in [0, 255]; otherwise
            they are assumed to be floats in [-1, 1] and are rescaled.
        fps: playback frames per second.
        true_image: skip the [-1, 1] -> [0, 255] rescaling when True.

    Returns:
        A ``mpy.VideoClip`` of duration ``len(images) / fps``.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Map clip time t back to a frame index; clamp to the last frame
        # when rounding overshoots the list instead of raising.
        try:
            x = images[int(len(images) / duration * t)]
        except IndexError:  # was a bare `except:` — narrow to the real failure
            x = images[-1]
        if true_image:
            return x.astype(np.uint8)
        # Rescale assumed [-1, 1] floats to uint8 pixel values.
        return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip
%matplotlib inline
# Headless SDL video driver: disables the pop-out game window so PLE can
# render off-screen (required on servers/notebooks without a display).
os.environ["SDL_VIDEODRIVER"] = "dummy"
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()
# game.actions: dictionary mapping action description -> key code
print(game.actions)
# env.getActionSet(): list of action key codes (includes None = no-op)
print(env.getActionSet())
{'up': 119}
[119, None]
# A dictionary describing the game state. Fields:
'''
player y position.
player's velocity.
next pipe distance to player.
next pipe top y position.
next pipe bottom y position.
next next pipe distance to player.
next next pipe top y position.
next next pipe bottom y position.
'''
game.getGameState()
{'player_y': 256,
'player_vel': 0,
'next_pipe_dist_to_player': 309.0,
'next_pipe_top_y': 144,
'next_pipe_bottom_y': 244,
'next_next_pipe_dist_to_player': 453.0,
'next_next_pipe_top_y': 160,
'next_next_pipe_bottom_y': 260}
class Agent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Raw game-state dicts are discretized into bucket-index tuples which key
    a defaultdict Q-table of per-action value arrays.
    """

    def __init__(self,
                 bucket_range_per_feature,
                 num_action,
                 t=0,
                 discount_factor=0.99):
        self.update_parameters(t)  # init explore rate and learning rate
        # Unseen states lazily get a zero Q-value for every action.
        self.q_table = defaultdict(lambda: np.zeros(num_action))
        self.discount_factor = discount_factor
        self.num_action = num_action
        # How to discretize each feature in a state. The larger each bucket,
        # the less time to train but the worse the performance; e.g. if
        # range = 2, feature value 1 equals feature value 0 because
        # int(1/2) == int(0/2).
        self.bucket_range_per_feature = bucket_range_per_feature

    def select_action(self, state):
        """Epsilon-greedy action selection for `state`."""
        state_idx = self.get_state_idx(state)
        if np.random.rand() < self.exploring_rate:
            # Fix: was `np.random.choice(num_action)`, silently relying on a
            # module-level global instead of the instance attribute.
            action = np.random.choice(self.num_action)  # random action
        else:
            # Greedy: action with the highest Q-value in this state.
            action = np.argmax(self.q_table[state_idx])
        return action

    def update_policy(self, state, action, reward, state_prime):
        """Q-learning update: Q(s,a) += lr * (r + gamma*max_a' Q(s',a') - Q(s,a))."""
        state_idx = self.get_state_idx(state)
        state_prime_idx = self.get_state_idx(state_prime)
        best_q = np.max(self.q_table[state_prime_idx])
        self.q_table[state_idx][action] += self.learning_rate * (
            reward + self.discount_factor * best_q - self.q_table[state_idx][action])

    def get_state_idx(self, state):
        """Discretize a raw state dict into a hashable tuple of bucket indices."""
        # Use pipe positions relative to the player rather than absolute ones.
        state = copy.deepcopy(state)
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        # Sort keys so the resulting tuple layout is deterministic.
        state_key = [k for k, v in sorted(state.items())]
        # Bucket each feature to shrink the state space and speed up training.
        state_idx = []
        for key in state_key:
            state_idx.append(
                int(state[key] / self.bucket_range_per_feature[key]))
        return tuple(state_idx)

    def update_parameters(self, episode):
        """Decay the exploration rate as training progresses."""
        self.exploring_rate = max(MIN_EXPLORING_RATE,
                                  min(0.5, 0.99**((episode) / 30)))
        # NOTE(review): MIN_LEARNING_RATE == 0.5 and the cap is 0.5, so the
        # learning rate is effectively constant at 0.5.
        self.learning_rate = max(MIN_LEARNING_RATE, min(0.5, 0.99
                                                        ** ((episode) / 30)))

    def shutdown_explore(self):
        # Make action selection fully greedy.
        self.exploring_rate = 0
# Number of available actions: [flap, no-op] per env.getActionSet().
num_action = len(env.getActionSet())
# Bucket width per state feature used by Agent.get_state_idx; larger buckets
# mean a smaller state space (faster training, coarser policy).
bucket_range_per_feature = {
    'next_next_pipe_bottom_y': 40,
    'next_next_pipe_dist_to_player': 512,
    'next_next_pipe_top_y': 40,
    'next_pipe_bottom_y': 20,
    'next_pipe_dist_to_player': 20,
    'next_pipe_top_y': 20,
    'player_vel': 4,
    'player_y': 16
}
# init agent
agent = Agent(bucket_range_per_feature, num_action)

# Per-evaluation-episode bookkeeping (filled during the training loop).
reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 500       # evaluate greedily every 500 episodes
show_gif_every_episode = 5000   # render an animation every 5000 episodes
NUM_EPISODE = 40000
# Main Q-learning training loop.
for episode in range(0, NUM_EPISODE):
    # Reset the environment
    env.reset_game()

    # record frame
    frames = [env.getScreenRGB()]

    # for every 500 episodes, shutdown exploration to see performance of greedy action
    if episode % print_every_episode == 0:
        agent.shutdown_explore()

    # the initial state
    state = game.getGameState()

    # cumulate reward for this episode
    cum_reward = 0
    t = 0
    while not env.game_over():
        # select an action
        action = agent.select_action(state)

        # execute the action and get reward
        # reward = +1 when pass a pipe, -5 when die
        reward = env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # observe the result
        state_prime = game.getGameState()  # get next state

        # update agent
        agent.update_policy(state, action, reward, state_prime)

        # Setting up for the next iteration
        state = state_prime
        t += 1

    # update exploring_rate and learning_rate
    agent.update_parameters(episode)

    # Metrics are recorded only on the greedy (no-exploration) episodes.
    if episode % print_every_episode == 0:
        print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
            episode,
            t,
            cum_reward,
            agent.exploring_rate,
            agent.learning_rate
        ))
        reward_per_epoch.append(cum_reward)
        exploring_rates.append(agent.exploring_rate)
        learning_rates.append(agent.learning_rate)
        lifetime_per_epoch.append(t)

    # for every 5000 episode, record an animation
    if episode % show_gif_every_episode == 0:
        print("len frames:", len(frames))
        # Rotate -90 degrees because PLE screen buffers are transposed.
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5
Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 64 time steps, cumulated reward: -5.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 46 time steps, cumulated reward: -5.0, exploring rate: 0.04147740932356356, learning rate: 0.5
Episode 10000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 281 time steps, cumulated reward: 1.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 48 time steps, cumulated reward: -5.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5
t: 25%|██▌ | 42/166 [00:00<00:00, 418.84it/s, now=None]
Episode 15000 finished after 165 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 len frames: 166 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 298 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 529 time steps, cumulated reward: 8.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 148 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 288 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5
t: 14%|█▍ | 45/325 [00:00<00:00, 440.40it/s, now=None]
Episode 20000 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 len frames: 325 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 85 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 209 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 438 time steps, cumulated reward: 5.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 182 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 1382 time steps, cumulated reward: 31.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 1942 time steps, cumulated reward: 45.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 586 time steps, cumulated reward: 9.0, exploring rate: 0.01, learning rate: 0.5
t: 5%|▍ | 40/813 [00:00<00:01, 393.32it/s, now=None]
Episode 25000 finished after 812 time steps, cumulated reward: 15.0, exploring rate: 0.01, learning rate: 0.5 len frames: 813 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 2661 time steps, cumulated reward: 64.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 1942 time steps, cumulated reward: 45.0, exploring rate: 0.01, learning rate: 0.5 Episode 27500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 28000 finished after 437 time steps, cumulated reward: 5.0, exploring rate: 0.01, learning rate: 0.5 Episode 28500 finished after 184 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 29000 finished after 476 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 29500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5
t: 10%|▉ | 39/402 [00:00<00:00, 386.35it/s, now=None]
Episode 30000 finished after 401 time steps, cumulated reward: 4.0, exploring rate: 0.01, learning rate: 0.5 len frames: 402 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 30500 finished after 1716 time steps, cumulated reward: 39.0, exploring rate: 0.01, learning rate: 0.5 Episode 31000 finished after 1192 time steps, cumulated reward: 25.0, exploring rate: 0.01, learning rate: 0.5 Episode 31500 finished after 3524 time steps, cumulated reward: 87.0, exploring rate: 0.01, learning rate: 0.5 Episode 32000 finished after 1349 time steps, cumulated reward: 30.0, exploring rate: 0.01, learning rate: 0.5 Episode 32500 finished after 929 time steps, cumulated reward: 19.0, exploring rate: 0.01, learning rate: 0.5 Episode 33000 finished after 2664 time steps, cumulated reward: 64.0, exploring rate: 0.01, learning rate: 0.5 Episode 33500 finished after 5119 time steps, cumulated reward: 130.0, exploring rate: 0.01, learning rate: 0.5 Episode 34000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 34500 finished after 663 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5
t: 2%|▏ | 46/2522 [00:00<00:05, 450.73it/s, now=None]
Episode 35000 finished after 2521 time steps, cumulated reward: 61.0, exploring rate: 0.01, learning rate: 0.5 len frames: 2522 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 35500 finished after 2887 time steps, cumulated reward: 70.0, exploring rate: 0.01, learning rate: 0.5 Episode 36000 finished after 7258 time steps, cumulated reward: 187.0, exploring rate: 0.01, learning rate: 0.5 Episode 36500 finished after 2071 time steps, cumulated reward: 49.0, exploring rate: 0.01, learning rate: 0.5 Episode 37000 finished after 1115 time steps, cumulated reward: 23.0, exploring rate: 0.01, learning rate: 0.5 Episode 37500 finished after 3686 time steps, cumulated reward: 92.0, exploring rate: 0.01, learning rate: 0.5 Episode 38000 finished after 409 time steps, cumulated reward: 5.0, exploring rate: 0.01, learning rate: 0.5 Episode 38500 finished after 330 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 39000 finished after 663 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 39500 finished after 2749 time steps, cumulated reward: 67.0, exploring rate: 0.01, learning rate: 0.5
def demo():
    """Play one fully-greedy episode and show it as an inline animation."""
    env.reset_game()
    # Capture the first screen before the episode starts.
    screen_history = [env.getScreenRGB()]
    # Greedy policy only — no exploration during the demo.
    agent.shutdown_explore()
    current_state = game.getGameState()
    while not env.game_over():
        chosen = agent.select_action(current_state)
        env.act(env.getActionSet()[chosen])
        screen_history.append(env.getScreenRGB())
        current_state = game.getGameState()
    animation = make_anim(screen_history, fps=60, true_image=True).rotate(-90)
    display(animation.ipython_display(fps=60, autoplay=1, loop=1))

demo()
t: 3%|▎ | 39/1319 [00:00<00:03, 384.97it/s, now=None]
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
# Plot episode lifetime (time steps survived) against evaluation episodes.
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(lifetime_per_epoch)), lifetime_per_epoch)
fig.tight_layout()
plt.show()
# Plot cumulated reward against evaluation episodes.
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(reward_per_epoch)), reward_per_epoch)
plt.show()
# Retrain from scratch; this run also logs and animates any episode whose
# cumulated reward exceeds 100 (not only the periodic greedy evaluations).
agent = Agent(bucket_range_per_feature, num_action)

reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):
    # Reset the environment
    env.reset_game()

    # record frame
    frames = [env.getScreenRGB()]

    # for every 500 episodes, shutdown exploration to see performance of greedy action
    if episode % print_every_episode == 0:
        agent.shutdown_explore()

    # the initial state
    state = game.getGameState()

    # cumulate reward for this episode
    cum_reward = 0
    t = 0
    while not env.game_over():
        # select an action
        action = agent.select_action(state)

        # execute the action and get reward
        # reward = +1 when pass a pipe, -5 when die
        reward = env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # observe the result
        state_prime = game.getGameState()  # get next state

        # update agent
        agent.update_policy(state, action, reward, state_prime)

        # Setting up for the next iteration
        state = state_prime
        t += 1

    # update exploring_rate and learning_rate
    agent.update_parameters(episode)

    if episode % print_every_episode == 0 or cum_reward > 100:
        print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
            episode,
            t,
            cum_reward,
            agent.exploring_rate,
            agent.learning_rate
        ))
        reward_per_epoch.append(cum_reward)
        exploring_rates.append(agent.exploring_rate)
        learning_rates.append(agent.learning_rate)
        lifetime_per_epoch.append(t)

    # for every 5000 episode (or a >100-reward episode), record an animation
    if episode % show_gif_every_episode == 0 or cum_reward > 100:
        print("len frames:", len(frames))
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        # Fix: long episodes produced clips over ipython_display's default
        # 60-second maxduration and raised ValueError (see the crash at
        # episode 27018). Pass an explicit bound covering this clip.
        display(clip.ipython_display(fps=60, autoplay=1, loop=1,
                                     maxduration=clip.duration + 1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 59 time steps, cumulated reward: -5.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5
Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 77 time steps, cumulated reward: -4.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 108 time steps, cumulated reward: -3.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.04147740932356356, learning rate: 0.5
t: 30%|███ | 41/135 [00:00<00:00, 404.46it/s, now=None]
Episode 10000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 135 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 77 time steps, cumulated reward: -4.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5
Episode 15000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 178 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 299 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5
t: 26%|██▌ | 34/133 [00:00<00:00, 337.93it/s, now=None]
Episode 20000 finished after 132 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 len frames: 133 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 676 time steps, cumulated reward: 12.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 111 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 220 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 178 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 289 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 97 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5
t: 6%|▌ | 39/700 [00:00<00:01, 385.68it/s, now=None]
Episode 25000 finished after 699 time steps, cumulated reward: 12.0, exploring rate: 0.01, learning rate: 0.5 len frames: 700 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 27018 finished after 4130 time steps, cumulated reward: 103.0, exploring rate: 0.01, learning rate: 0.5 len frames: 4131 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-11-272f53a5ac0d> in <module> 68 print("len frames:", len(frames)) 69 clip = make_anim(frames, fps=60, true_image=True).rotate(-90) ---> 70 display(clip.ipython_display(fps=60, autoplay=1, loop=1)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in ipython_display(clip, filetype, maxduration, t, fps, rd_kwargs, center, **html_kwargs) 219 220 return HTML2(html_embed(clip, filetype=filetype, maxduration=maxduration, --> 221 center=center, rd_kwargs=rd_kwargs, **html_kwargs)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 106 107 return html_embed(filename, maxduration=maxduration, rd_kwargs=rd_kwargs, --> 108 center=center, **html_kwargs) 109 110 filename = clip ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 141 if duration > maxduration: 142 raise ValueError("The duration of video %s (%.1f) exceeds the 'maxduration' "%(filename, duration)+ --> 143 "attribute. You can increase 'maxduration', by passing 'maxduration' parameter" 144 "to ipython_display function." 145 "But note that embedding large videos may take all the memory away !") ValueError: The duration of video __temp__.mp4 (68.8) exceeds the 'maxduration' attribute. You can increase 'maxduration', by passing 'maxduration' parameterto ipython_display function.But note that embedding large videos may take all the memory away !
def demo():
    """Run one fully-greedy episode and display it as an inline animation."""
    # Reset the environment
    env.reset_game()

    # record frame
    frames = [env.getScreenRGB()]

    # shutdown exploration to see performance of greedy action
    agent.shutdown_explore()

    # the initial state
    state = game.getGameState()

    while not env.game_over():
        # select an action
        action = agent.select_action(state)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())

        # observe the result
        state_prime = game.getGameState()  # get next state

        # Setting up for the next iteration
        state = state_prime

    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    # Fix: a good policy can survive past 60 s of video, which exceeds
    # ipython_display's default maxduration and raises ValueError; pass an
    # explicit bound covering this clip.
    display(clip.ipython_display(fps=60, autoplay=1, loop=1,
                                 maxduration=clip.duration + 1))

demo()
t: 2%|▏ | 38/1843 [00:00<00:04, 375.26it/s, now=None]
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
# Plot episode lifetime (time steps survived) against evaluation episodes.
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(lifetime_per_epoch)), lifetime_per_epoch)
fig.tight_layout()
plt.show()
# Plot cumulated reward against evaluation episodes.
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(reward_per_epoch)), reward_per_epoch)
plt.show()
class SARSA_Agent:
    """Tabular SARSA (on-policy) agent with epsilon-greedy exploration.

    Identical to Agent except update_policy bootstraps from the action
    actually taken in the next state instead of the greedy maximum.
    """

    def __init__(self,
                 bucket_range_per_feature,
                 num_action,
                 t=0,
                 discount_factor=0.99):
        self.update_parameters(t)  # init explore rate and learning rate
        # Unseen states lazily get a zero Q-value for every action.
        self.q_table = defaultdict(lambda: np.zeros(num_action))
        self.discount_factor = discount_factor
        self.num_action = num_action
        # How to discretize each feature in a state. The larger each bucket,
        # the less time to train but the worse the performance; e.g. if
        # range = 2, feature value 1 equals feature value 0 because
        # int(1/2) == int(0/2).
        self.bucket_range_per_feature = bucket_range_per_feature

    def select_action(self, state):
        """Epsilon-greedy action selection for `state`."""
        state_idx = self.get_state_idx(state)
        if np.random.rand() < self.exploring_rate:
            # Fix: was `np.random.choice(num_action)`, silently relying on a
            # module-level global instead of the instance attribute.
            action = np.random.choice(self.num_action)  # random action
        else:
            # Greedy: action with the highest Q-value in this state.
            action = np.argmax(self.q_table[state_idx])
        return action

    def update_policy(self, state, action, reward, state_prime, action_prime):
        """SARSA update: Q(s,a) += lr * (r + gamma*Q(s',a') - Q(s,a))."""
        state_idx = self.get_state_idx(state)
        state_prime_idx = self.get_state_idx(state_prime)
        # On-policy target: the Q-value of the action actually chosen next.
        q_of_state_prime_and_action_prime = self.q_table[state_prime_idx][action_prime]
        self.q_table[state_idx][action] += self.learning_rate * (
            reward + self.discount_factor * q_of_state_prime_and_action_prime
            - self.q_table[state_idx][action])

    def get_state_idx(self, state):
        """Discretize a raw state dict into a hashable tuple of bucket indices."""
        # Use pipe positions relative to the player rather than absolute ones.
        state = copy.deepcopy(state)
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        # Sort keys so the resulting tuple layout is deterministic.
        state_key = [k for k, v in sorted(state.items())]
        # Bucket each feature to shrink the state space and speed up training.
        state_idx = []
        for key in state_key:
            state_idx.append(
                int(state[key] / self.bucket_range_per_feature[key]))
        return tuple(state_idx)

    def update_parameters(self, episode):
        """Decay the exploration rate as training progresses."""
        self.exploring_rate = max(MIN_EXPLORING_RATE,
                                  min(0.5, 0.99**((episode) / 30)))
        # NOTE(review): MIN_LEARNING_RATE == 0.5 and the cap is 0.5, so the
        # learning rate is effectively constant at 0.5.
        self.learning_rate = max(MIN_LEARNING_RATE, min(0.5, 0.99
                                                        ** ((episode) / 30)))

    def shutdown_explore(self):
        # Make action selection fully greedy.
        self.exploring_rate = 0
%matplotlib inline
os.environ["SDL_VIDEODRIVER"] = "dummy" # this line disable pop-out window
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False) # environment interface to game
env.reset_game()
num_action = len(env.getActionSet())
bucket_range_per_feature = {
'next_next_pipe_bottom_y': 40,
'next_next_pipe_dist_to_player': 512,
'next_next_pipe_top_y': 40,
'next_pipe_bottom_y': 20,
'next_pipe_dist_to_player': 20,
'next_pipe_top_y': 20,
'player_vel': 4,
'player_y': 16
}
# init agent
sarsa_agent = SARSA_Agent(bucket_range_per_feature, num_action)
reward_per_epoch_sarsa = []
lifetime_per_epoch_sarsa = []
exploring_rates_sarsa = []
learning_rates_sarsa = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):
# Reset the environment
env.reset_game()
# record frame
frames = [env.getScreenRGB()]
# for every 500 episodes, shutdown exploration to see performance of greedy action
if episode % print_every_episode == 0:
sarsa_agent.shutdown_explore()
# the initial state
state = game.getGameState()
# cumulate reward for this episode
cum_reward = 0
t = 0
# select the first action on init state
action = sarsa_agent.select_action(state)
while not env.game_over():
# execute the action and get reward
# reward = +1 when pass a pipe, -5 when die
reward = env.act(env.getActionSet()[action])
frames.append(env.getScreenRGB())
# cumulate reward
cum_reward += reward
# observe the result
state_prime = game.getGameState() # get next state
# get next action on state_prime
action_prime = sarsa_agent.select_action(state_prime)
# update sarsa_agent
sarsa_agent.update_policy(state, action, reward, state_prime, action_prime)
# Setting up for the next iteration
state = state_prime
action = action_prime
t += 1
# update exploring_rate and learning_rate
sarsa_agent.update_parameters(episode)
if episode % print_every_episode == 0:
print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
episode,
t,
cum_reward,
sarsa_agent.exploring_rate,
sarsa_agent.learning_rate
))
reward_per_epoch_sarsa.append(cum_reward)
exploring_rates_sarsa.append(sarsa_agent.exploring_rate)
learning_rates_sarsa.append(sarsa_agent.learning_rate)
lifetime_per_epoch_sarsa.append(t)
# for every 5000 episode, record an animation
if episode % show_gif_every_episode == 0:
print("len frames:", len(frames))
clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
display(clip.ipython_display(fps=60, autoplay=1, loop=1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 60 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 61 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 70 time steps, cumulated reward: -4.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5
Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 44 time steps, cumulated reward: -5.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.04147740932356356, learning rate: 0.5
Episode 10000 finished after 75 time steps, cumulated reward: -4.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 76 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 110 time steps, cumulated reward: -3.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 147 time steps, cumulated reward: -2.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 133 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 141 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5
t: 12%|█▏ | 44/361 [00:00<00:00, 434.24it/s, now=None]
Episode 15000 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 len frames: 361 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 145 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 586 time steps, cumulated reward: 9.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 39 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 47 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5
t: 6%|▌ | 27/474 [00:00<00:01, 267.02it/s, now=None]
Episode 20000 finished after 473 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 len frames: 474 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 514 time steps, cumulated reward: 7.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 177 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 105 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 160 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 171 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5
t: 8%|▊ | 48/587 [00:00<00:01, 471.72it/s, now=None]
Episode 25000 finished after 586 time steps, cumulated reward: 9.0, exploring rate: 0.01, learning rate: 0.5 len frames: 587 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 1038 time steps, cumulated reward: 21.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 417 time steps, cumulated reward: 5.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 27500 finished after 253 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 28000 finished after 65 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 28500 finished after 145 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 29000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 29500 finished after 624 time steps, cumulated reward: 10.0, exploring rate: 0.01, learning rate: 0.5
t: 33%|███▎ | 44/135 [00:00<00:00, 433.00it/s, now=None]
Episode 30000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 len frames: 135 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 30500 finished after 514 time steps, cumulated reward: 7.0, exploring rate: 0.01, learning rate: 0.5 Episode 31000 finished after 850 time steps, cumulated reward: 16.0, exploring rate: 0.01, learning rate: 0.5 Episode 31500 finished after 1305 time steps, cumulated reward: 28.0, exploring rate: 0.01, learning rate: 0.5 Episode 32000 finished after 358 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 32500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 33000 finished after 901 time steps, cumulated reward: 18.0, exploring rate: 0.01, learning rate: 0.5 Episode 33500 finished after 2058 time steps, cumulated reward: 48.0, exploring rate: 0.01, learning rate: 0.5 Episode 34000 finished after 473 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 34500 finished after 2744 time steps, cumulated reward: 67.0, exploring rate: 0.01, learning rate: 0.5
t: 3%|▎ | 46/1568 [00:00<00:03, 451.88it/s, now=None]
Episode 35000 finished after 1567 time steps, cumulated reward: 35.0, exploring rate: 0.01, learning rate: 0.5 len frames: 1568 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 35500 finished after 296 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 36000 finished after 2132 time steps, cumulated reward: 50.0, exploring rate: 0.01, learning rate: 0.5 Episode 36500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 37000 finished after 359 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 37500 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 38000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 38500 finished after 776 time steps, cumulated reward: 14.0, exploring rate: 0.01, learning rate: 0.5 Episode 39000 finished after 257 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 39500 finished after 2170 time steps, cumulated reward: 51.0, exploring rate: 0.01, learning rate: 0.5
def demo():
# Reset the environment
env.reset_game()
# record frame
frames = [env.getScreenRGB()]
# shutdown exploration to see performance of greedy action
sarsa_agent.shutdown_explore()
# the initial state
state = game.getGameState()
while not env.game_over():
# select an action
action = sarsa_agent.select_action(state)
# execute the action and get reward
reward = env.act(env.getActionSet()[action])
frames.append(env.getScreenRGB())
# observe the result
state_prime = game.getGameState() # get next state
# Setting up for the next iteration
state = state_prime
clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
display(clip.ipython_display(fps=60, autoplay=1, loop=1))
demo()
t: 4%|▍ | 44/1003 [00:00<00:02, 432.91it/s, now=None]
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
# plot life time against training episodes
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(lifetime_per_epoch_sarsa)), lifetime_per_epoch_sarsa)
fig.tight_layout()
plt.show()
# plot reward against training episodes
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(reward_per_epoch_sarsa)), reward_per_epoch_sarsa)
plt.show()
%matplotlib inline
os.environ["SDL_VIDEODRIVER"] = "dummy" # this line disable pop-out window
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False) # environment interface to game
env.reset_game()
num_action = len(env.getActionSet())
bucket_range_per_feature = {
'next_next_pipe_bottom_y': 40,
'next_next_pipe_dist_to_player': 512,
'next_next_pipe_top_y': 40,
'next_pipe_bottom_y': 20,
'next_pipe_dist_to_player': 20,
'next_pipe_top_y': 20,
'player_vel': 4,
'player_y': 16
}
# init agent
sarsa_agent = SARSA_Agent(bucket_range_per_feature, num_action)
reward_per_epoch_sarsa = []
lifetime_per_epoch_sarsa = []
exploring_rates_sarsa = []
learning_rates_sarsa = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):
# Reset the environment
env.reset_game()
# record frame
frames = [env.getScreenRGB()]
# for every 500 episodes, shutdown exploration to see performance of greedy action
if episode % print_every_episode == 0:
sarsa_agent.shutdown_explore()
# the initial state
state = game.getGameState()
# cumulate reward for this episode
cum_reward = 0
t = 0
# select the first action on init state
action = sarsa_agent.select_action(state)
while not env.game_over():
# execute the action and get reward
# reward = +1 when pass a pipe, -5 when die
reward = env.act(env.getActionSet()[action])
frames.append(env.getScreenRGB())
# cumulate reward
cum_reward += reward
# observe the result
state_prime = game.getGameState() # get next state
# get next action on state_prime
action_prime = sarsa_agent.select_action(state_prime)
# update sarsa_agent
sarsa_agent.update_policy(state, action, reward, state_prime, action_prime)
# Setting up for the next iteration
state = state_prime
action = action_prime
t += 1
# update exploring_rate and learning_rate
sarsa_agent.update_parameters(episode)
if episode % print_every_episode == 0 or cum_reward > 100:
print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
episode,
t,
cum_reward,
sarsa_agent.exploring_rate,
sarsa_agent.learning_rate
))
reward_per_epoch_sarsa.append(cum_reward)
exploring_rates_sarsa.append(sarsa_agent.exploring_rate)
learning_rates_sarsa.append(sarsa_agent.learning_rate)
lifetime_per_epoch_sarsa.append(t)
# for every 5000 episode, record an animation
if episode % show_gif_every_episode == 0 or cum_reward > 100:
print("len frames:", len(frames))
clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
display(clip.ipython_display(fps=60, autoplay=1, loop=1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 60 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5
Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 72 time steps, cumulated reward: -4.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 65 time steps, cumulated reward: -5.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 75 time steps, cumulated reward: -4.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.04147740932356356, learning rate: 0.5
t: 42%|████▏ | 42/99 [00:00<00:00, 414.12it/s, now=None]
Episode 10000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 99 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 67 time steps, cumulated reward: -4.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 112 time steps, cumulated reward: -3.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 101 time steps, cumulated reward: -4.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 91 time steps, cumulated reward: -4.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 63 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 130 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5
Episode 15000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4 Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 136 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 75 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 64 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5
t: 5%|▍ | 10/212 [00:00<00:02, 97.89it/s, now=None]
Episode 20000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 len frames: 212 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 1002 time steps, cumulated reward: 20.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 208 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5
t: 5%|▌ | 33/618 [00:00<00:01, 327.42it/s, now=None]
Episode 25000 finished after 617 time steps, cumulated reward: 10.0, exploring rate: 0.01, learning rate: 0.5 len frames: 618 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 1242 time steps, cumulated reward: 27.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 27500 finished after 901 time steps, cumulated reward: 18.0, exploring rate: 0.01, learning rate: 0.5 Episode 28000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 28500 finished after 1228 time steps, cumulated reward: 26.0, exploring rate: 0.01, learning rate: 0.5 Episode 29000 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 29500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5
t: 2%|▏ | 41/1717 [00:00<00:04, 405.03it/s, now=None]
Episode 30000 finished after 1716 time steps, cumulated reward: 39.0, exploring rate: 0.01, learning rate: 0.5 len frames: 1717 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 30500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 31000 finished after 1228 time steps, cumulated reward: 26.0, exploring rate: 0.01, learning rate: 0.5
t: 1%| | 42/5374 [00:00<00:12, 418.11it/s, now=None]
Episode 31343 finished after 5373 time steps, cumulated reward: 136.0, exploring rate: 0.01, learning rate: 0.5 len frames: 5374 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-7-326bbfa0e126> in <module> 72 print("len frames:", len(frames)) 73 clip = make_anim(frames, fps=60, true_image=True).rotate(-90) ---> 74 display(clip.ipython_display(fps=60, autoplay=1, loop=1)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in ipython_display(clip, filetype, maxduration, t, fps, rd_kwargs, center, **html_kwargs) 219 220 return HTML2(html_embed(clip, filetype=filetype, maxduration=maxduration, --> 221 center=center, rd_kwargs=rd_kwargs, **html_kwargs)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 106 107 return html_embed(filename, maxduration=maxduration, rd_kwargs=rd_kwargs, --> 108 center=center, **html_kwargs) 109 110 filename = clip ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 141 if duration > maxduration: 142 raise ValueError("The duration of video %s (%.1f) exceeds the 'maxduration' "%(filename, duration)+ --> 143 "attribute. You can increase 'maxduration', by passing 'maxduration' parameter" 144 "to ipython_display function." 145 "But note that embedding large videos may take all the memory away !") ValueError: The duration of video __temp__.mp4 (89.6) exceeds the 'maxduration' attribute. You can increase 'maxduration', by passing 'maxduration' parameterto ipython_display function.But note that embedding large videos may take all the memory away !
def demo():
# Reset the environment
env.reset_game()
# record frame
frames = [env.getScreenRGB()]
# shutdown exploration to see performance of greedy action
sarsa_agent.shutdown_explore()
# the initial state
state = game.getGameState()
while not env.game_over():
# select an action
action = sarsa_agent.select_action(state)
# execute the action and get reward
reward = env.act(env.getActionSet()[action])
frames.append(env.getScreenRGB())
# observe the result
state_prime = game.getGameState() # get next state
# Setting up for the next iteration
state = state_prime
clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
display(clip.ipython_display(fps=60, autoplay=1, loop=1))
demo()
t: 2%|▏ | 38/1871 [00:00<00:04, 379.70it/s, now=None]
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
# plot life time against training episodes
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(lifetime_per_epoch_sarsa)), lifetime_per_epoch_sarsa)
fig.tight_layout()
plt.show()
# plot reward against training episodes
fig, ax1 = plt.subplots(figsize=(20, 5))
plt.plot(range(len(reward_per_epoch_sarsa)), reward_per_epoch_sarsa)
plt.show()
In this lab, I do some experiments by running Q-Learning algorithm and SARSA algorithm.
First, I run Q-Learning and SARSA for 40000 episodes and record the play time and reward.
I find out that in the 40000 episodes, Q-Learning can actually learn faster than SARSA. Since the average lifetime of Q-Learning is obviously higher than SARSA. And the average reward is too.
For the second experiment, I just want to show those games that has reward over than 100 in both algorithm. However, in the process, I ecounter the problem mention by TA, which means the agents play for too long.
Then I show demo for both algorithm. From my observation, I think that when SARSA plays, the bird will 'prepare' for next pipe early, to prevent the 'cliffs'. When Q-Learning plays, the bird flies a little bit risky.